CO2 emissions by GDP per capita

gapminder <- read_csv("~/R/Projects/Datasets/gapminder_clean.csv", col_names = TRUE)

styling1 <- list(
  geom_point(),
  scale_x_log10(),
  scale_y_log10(),
  geom_smooth(method = "lm", se = FALSE),
  theme_classic(), 
  labs(
    caption = "Source from Gapminder"
    )
  )

gapminder_1962 <- gapminder %>%
  filter(Year == 1962) 

ggplot(gapminder_1962, aes(gdpPercap, `CO2 emissions (metric tons per capita)`)) + 
  styling1 +
  labs(title = "CO2 emissions by GDP per capita in 1962",
       x = "GDP per capita"
  )

The Pearson correlation of 'CO2 emissions (metric tons per capita)' and gdpPercap in the year 1962 is 0.926 (p = <2.2e-16).

cor.test(gapminder_1962$gdpPercap, gapminder_1962$`CO2 emissions (metric tons per capita)`, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  gapminder_1962$gdpPercap and gapminder_1962$`CO2 emissions (metric tons per capita)`
## t = 25.269, df = 106, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8934697 0.9489792
## sample estimates:
##       cor 
## 0.9260817

Q: In what year is the correlation between 'CO2 emissions (metric tons per capita)' and gdpPercap the strongest?

A: 1967 contains the strongest correlation between 'CO2 emissions (metric tons per capita)' and gdpPercap.

# In what year is the correlation between 'CO2 emissions (metric tons per capita)' and gdpPercap the strongest?
corr_gapminder <- gapminder %>%
  group_by(Year) %>%
  select(gdpPercap, `CO2 emissions (metric tons per capita)`, Year) %>%
  summarize(corr = cor(gdpPercap, `CO2 emissions (metric tons per capita)`, use = "complete.obs")) %>%
  arrange(desc(corr))

corr_gapminder
## # A tibble: 10 x 2
##     Year  corr
##    <dbl> <dbl>
##  1  1967 0.939
##  2  1962 0.926
##  3  1972 0.843
##  4  1982 0.817
##  5  1987 0.810
##  6  1992 0.809
##  7  1997 0.808
##  8  2002 0.801
##  9  1977 0.793
## 10  2007 0.720
gapminder_1967 <- gapminder %>%
  filter(Year == 1967)

# interactive scatter plot
ggplotly(ggplot(gapminder_1967, aes(gdpPercap, `CO2 emissions (metric tons per capita)`, size = pop, color = continent)) + 
  styling1 +
  labs(title = "CO2 emissions by GDP per capita in 1967",
       x = "GDP per capita"
       )
  )

Q: What is the relationship between continent and 'Energy use (kg of oil equivalent per capita)'?

A: There is a statistically significant difference [F(4, 516) = 58.316, p = < 2.2e-16] in Energy use (kg of oil equivalent per capita) between continents as determined by one-way ANOVA. Because R-squared is small (~0.3), there is a weak linear relation between the treatment levels and response.

complete_rec <- na.exclude(gapminder)
LM <- lm(`Energy use (kg of oil equivalent per capita)` ~ continent, data = complete_rec)
anova(LM)
## Analysis of Variance Table
## 
## Response: Energy use (kg of oil equivalent per capita)
##            Df     Sum Sq   Mean Sq F value    Pr(>F)    
## continent   4  679754869 169938717  58.316 < 2.2e-16 ***
## Residuals 516 1503687009   2914122                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(LM)
## 
## Call:
## lm(formula = `Energy use (kg of oil equivalent per capita)` ~ 
##     continent, data = complete_rec)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3132.8  -829.8  -296.8   164.9 13249.4 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          664.9      147.5   4.509 8.08e-06 ***
## continentAmericas    693.7      214.5   3.233   0.0013 ** 
## continentAsia        855.7      211.4   4.047 5.97e-05 ***
## continentEurope     2852.5      211.8  13.466  < 2e-16 ***
## continentOceania    3841.4      479.5   8.012 7.59e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1707 on 516 degrees of freedom
## Multiple R-squared:  0.3113, Adjusted R-squared:  0.306 
## F-statistic: 58.32 on 4 and 516 DF,  p-value: < 2.2e-16
#resid_data <- resid(LM)
#qqnorm(resid_data, pch = 1, frame = FALSE)
#qqline(resid_data)

# boxplot
ggplot(complete_rec, aes(continent, `Energy use (kg of oil equivalent per capita)`)) +
  geom_boxplot(color = "red", fill = "orange", alpha = 0.2) +
  theme_classic() +
  labs(
    x = "Continent",
    caption = "Source from Gapminder"
  )

# summary stats
complete_rec %>%
  group_by(continent) %>%
  get_summary_stats(`Energy use (kg of oil equivalent per capita)`, type = "mean_sd")
## # A tibble: 5 x 5
##   continent variable                                         n  mean    sd
##   <chr>     <chr>                                        <dbl> <dbl> <dbl>
## 1 Africa    Energy use (kg of oil equivalent per capita)   134  665.  597.
## 2 Americas  Energy use (kg of oil equivalent per capita)   120 1359. 2041.
## 3 Asia      Energy use (kg of oil equivalent per capita)   127 1521. 1952.
## 4 Europe    Energy use (kg of oil equivalent per capita)   126 3517. 1943.
## 5 Oceania   Energy use (kg of oil equivalent per capita)    14 4506.  816.
# outliers. may try to take out
complete_rec %>%
  group_by(continent) %>%
  identify_outliers(`Energy use (kg of oil equivalent per capita)`) %>%
  select(continent, is.outlier, is.extreme)
## # A tibble: 42 x 3
##    continent is.outlier is.extreme
##    <chr>     <lgl>      <lgl>     
##  1 Africa    TRUE       FALSE     
##  2 Africa    TRUE       TRUE      
##  3 Africa    TRUE       TRUE      
##  4 Africa    TRUE       TRUE      
##  5 Africa    TRUE       TRUE      
##  6 Africa    TRUE       TRUE      
##  7 Africa    TRUE       TRUE      
##  8 Africa    TRUE       TRUE      
##  9 Africa    TRUE       TRUE      
## 10 Africa    TRUE       TRUE      
## # ... with 32 more rows
# check for normality; qqplot and shapiro
ggqqplot(residuals(LM))

shapiro_test(residuals(LM))
## # A tibble: 1 x 3
##   variable      statistic  p.value
##   <chr>             <dbl>    <dbl>
## 1 residuals(LM)     0.710 5.04e-29
# check for normality, but for each group
ggqqplot(complete_rec, "Energy use (kg of oil equivalent per capita)", facet.by = "continent")

# homogenity of variance
plot(LM, 1)

complete_rec %>% levene_test(`Energy use (kg of oil equivalent per capita)` ~ continent)
## # A tibble: 1 x 4
##     df1   df2 statistic           p
##   <int> <int>     <dbl>       <dbl>
## 1     4   516      8.66 0.000000905
#complete_rec$`Energy use (kg of oil equivalent per capita)` <- log10(complete_rec$`Energy use (kg of oil equivalent per capita)`)

#ggdensity(resid_data, x = "Energy use (kg of oil equivalent per capita)", color = "red", fill = "orange", alpha = 0.2) +
#  scale_x_continuous() +
#  stat_overlay_normal_density(color = "red", linetype = "dashed")


#shapiro.test(complete_rec$`Energy use (kg of oil equivalent per capita)`)

Q: Is there a significant difference between Europe and Asia with respect to 'Imports of goods and services (% of GDP)' in the years after 1990?

A: The mean Imports of goods and services (% of GDP) in Asia and Europe is 46.5 (SD = 37.5) and 41.6 (SD = 16.2), respectively. A Welch two-sample t-test demonstrates there is no statistically significant difference [t(91.9) = 1.04, p = 0.3, d = 0.169], meaning the means of the two populations are the same.

Note: To meet the normality assumption not met in the original data, a sqrt transformation was attempted. While the transformation improved the distribution of the data to normality, it provided no meaningful differences as it led to the same conclusion. Therefore, the analysis was done on the original data.

gapminder_post1990 <- complete_rec %>%
  filter(Year > 1990, continent %in% c("Europe", "Asia")) %>%
  select(Year, continent, `Imports of goods and services (% of GDP)`)

# qqplot
ggqqplot(gapminder_post1990, x = "Imports of goods and services (% of GDP)", facet.by = "continent")

# welch t test
t.test <- gapminder_post1990 %>% 
  t_test(`Imports of goods and services (% of GDP)` ~ continent) %>%
  add_significance()
t.test
## # A tibble: 1 x 9
##   .y.                   group1 group2    n1    n2 statistic    df     p p.signif
## * <chr>                 <chr>  <chr>  <int> <int>     <dbl> <dbl> <dbl> <chr>   
## 1 Imports of goods and~ Asia   Europe    73    99      1.04  91.9   0.3 ns
# cohen's d
gapminder_post1990 %>% cohens_d(`Imports of goods and services (% of GDP)` ~ continent, var.equal = FALSE)
## # A tibble: 1 x 7
##   .y.                                group1 group2 effsize    n1    n2 magnitude
## * <chr>                              <chr>  <chr>    <dbl> <int> <int> <ord>    
## 1 Imports of goods and services (% ~ Asia   Europe   0.169    73    99 negligib~
# summary stats
gapminder_post1990 %>%
  group_by(continent) %>%
  get_summary_stats(`Imports of goods and services (% of GDP)`, type = "mean_sd")
## # A tibble: 2 x 5
##   continent variable                                     n  mean    sd
##   <chr>     <chr>                                    <dbl> <dbl> <dbl>
## 1 Asia      Imports of goods and services (% of GDP)    73  46.5  37.5
## 2 Europe    Imports of goods and services (% of GDP)    99  41.6  16.2

Q: What is the country (or countries) that has the highest 'Population density (people per sq. km of land area)' across all years? (i.e., which country has the highest average ranking in this category across each time point in the dataset?)

A: The country with the highest average ranking in 'Population density (people per sq. km of land area)' across all years is Macao SAR, China, followed by Monaco.

gapminder_density_overall <- gapminder %>%
  group_by(`Country Name`) %>%
  summarize(mean_pop_density = mean(`Population density (people per sq. km of land area)`)) %>%
  top_n(5, mean_pop_density) 

gapminder_density_overall
## # A tibble: 5 x 2
##   `Country Name`       mean_pop_density
##   <chr>                           <dbl>
## 1 Gibraltar                       2622.
## 2 Hong Kong SAR, China            5153.
## 3 Macao SAR, China               14732.
## 4 Monaco                         14090.
## 5 Singapore                       4361.
ggplot(gapminder_density_overall, aes(x = reorder(`Country Name`,
                                              -mean_pop_density), 
                                  y = mean_pop_density)) +
  geom_bar(color = "red", fill = "orange", alpha = 0.2, width = 0.7, stat = "identity") +
  theme_classic() +
  labs(
    title = "Countries with the Highest Average Population Density",
    x = "Country Name",
    y = "Average Population Density",
    caption = "Source from Gapminder"
  )

Q: What country (or countries) has shown the greatest increase in 'Life expectancy at birth, total (years)' since 1962?

A: Cambodia has had the greatest increase (2.44) in 'Life expectancy at birth, total (years)'.

gapminder_life_expect <- gapminder %>%
  filter(Year > 1962) %>%
  arrange(Year) %>%
  group_by(`Country Name`) %>%
  mutate(diff_yr = Year - lag(Year),
         diff_life_expect = `Life expectancy at birth, total (years)` - lag(`Life expectancy at birth, total (years)`),
         rate_per_life = (diff_life_expect / diff_yr)/lag(`Life expectancy at birth, total (years)`) * 100) %>%
  summarize(avg_rate_life_expect = mean(rate_per_life, na.rm = TRUE)) %>%
  top_n(5, avg_rate_life_expect)  

gapminder_life_expect
## # A tibble: 5 x 2
##   `Country Name` avg_rate_life_expect
##   <chr>                         <dbl>
## 1 Bhutan                         1.66
## 2 Cambodia                       2.44
## 3 Maldives                       1.53
## 4 Mali                           1.50
## 5 Timor-Leste                    1.55
ggplot(gapminder_life_expect, aes(x = reorder(`Country Name`,
                                              -avg_rate_life_expect), 
                                  y = avg_rate_life_expect)) +
  geom_bar(color = "red", fill = "orange", alpha = 0.2, width = 0.7, stat = "identity") +
  theme_classic() +
  labs(
    title = "Countries with the Greatest Increase in Life Expectancy since 1962",
    x = "Country Name",
    y = "Average Life Expectancy Percent Rate",
    caption = "Source from Gapminder"
  )